Yushi Tang
February 26/28, 2019
Wang, Z., Gerstein, M., & Snyder, M. (2009). RNA-Seq: a revolutionary tool for transcriptomics. Nature reviews genetics, 10(1), 57.
#!/bin/bash
#SBATCH -N 1 # Number of nodes
#SBATCH -n 10 # Number of cores
#SBATCH -t 180 # Runtime in minutes (0~10080)
#SBATCH -p general # Partition
#SBATCH --mem=50000 # Total memory (varies across nodes)
#SBATCH -o star_%j.out # Standard out goes to this file
#SBATCH -e star_%j.err # Standard err goes to this file
#SBATCH --mail-type=END # Email
#SBATCH --mail-user=YOUR_EMAIL
module load gcc/4.8.2-fasrc01 STAR/2.5.0c-fasrc02
STAR --genomeDir $GENOME \
--readFilesIn $FASTQ1 $FASTQ2 \
--outFileNamePrefix $OUTDIR/ \
--outSAMprimaryFlag AllBestScore \
--outSAMtype BAM SortedByCoordinate \
--runThreadN 10 \
--alignEndsType EndToEnd sbatch STARalignment.shSalmon tutorial 1
Salmon Tutorial
Salmon tutorial 2
Salmon tutorial 3
Salmon tutorial 4
Salmon tutorial 5
Salmon tutorial 6
Salmon tutorial 7
Salmon tutorial 8
Salmon tutorial 9
Salmon tutorial 10
Salmon tutorial 11
Enjoy your Salmon
#!/bin/bash
#SBATCH -N 1 # Number of nodes
#SBATCH -n 10 # Number of cores
#SBATCH -t 180 # Runtime in minutes (0~10080)
#SBATCH -p general # Partition
#SBATCH --mem=50000 # Total memory (varies across nodes)
#SBATCH -o star_%j.out # Standard out goes to this file
#SBATCH -e star_%j.err # Standard err goes to this file
#SBATCH --mail-type=END # Email
#SBATCH --mail-user=YOUR_EMAIL
module load salmon
salmon index -t $TRANSCRIPTOME -i $INDEXsbatch createSalmonIndex.sh#!/bin/bash
#SBATCH -N 1 # Number of nodes
#SBATCH -n 10 # Number of cores
#SBATCH -t 180 # Runtime in minutes (0~10080)
#SBATCH -p general # Partition
#SBATCH --mem=50000 # Total memory (varies across nodes)
#SBATCH -o star_%j.out # Standard out goes to this file
#SBATCH -e star_%j.err # Standard err goes to this file
#SBATCH --mail-type=END # Email
#SBATCH --mail-user=YOUR_EMAIL
module load salmon
salmon quant -i $INDEX \
-l A \
-1 $FASTQ/ENCFF500PDO_sub.fastq\
-2 $FASTQ/ENCFF708KQE_sub.fastq \
-o $OUT \
--numBootstraps 100 \
-p 10 \
--gcBiassbatch Salmonalignment.sh.sf file. Copy these to your local directory (e.g. scp or fileZilla) for downstream analysis (DESeq part)DESeq2 packages. Install this via bioconductor.# Install required packages
source("https://bioconductor.org/biocLite.R")
biocLite("BiocUpgrade")
biocLite("DESeq2")
biocLite("tximport")
biocLite("EnsDb.Hsapiens.v86")
biocLite("EnsDb.Mmusculus.v79")
install.packages("rjson")library(DESeq2)
files <- grep("sf",list.files("Data"),value=TRUE)
condition <- c("4oh", "4oh", "4oh", "ctrl", "ctrl", "ctrl")
names <- c("4oh1", "4oh2", "4oh3", "ctrl1", "ctrl2", "ctrl3")
sampleTable <- data.frame(sampleName = files, fileName = files, condition = condition)library(EnsDb.Mmusculus.v79)
txdf <- transcripts(EnsDb.Mmusculus.v79, return.type="DataFrame")
tx2gene <- as.data.frame(txdf[,c("tx_id", "gene_id")])library(tximport)
txi <- tximport(file.path("Data",files), type="salmon", ignoreTxVersion = TRUE, tx2gene = tx2gene)
dds <- DESeqDataSetFromTximport(txi,colData=sampleTable,design=~condition)
dds <- dds[rowSums(counts(dds)) > 1, ]
dds <- DESeq(dds)res <- results(dds, alpha = 0.05)
res <- res[complete.cases(res),]
res <- res[order(res$padj),]
upR <- res[(res$padj < 0.05) & (res$log2FoldChange > 0),]
downR <- res[(res$padj < 0.05) & (res$log2FoldChange < 0),]
nrow(upR)## [1] 382
nrow(downR) ## [1] 518
plotMA(res)absOrdered <- rbind(upR,downR)
absOrdered <- absOrdered[order(abs(absOrdered$log2FoldChange),decreasing = TRUE),]
mostvariable <- log2(txi$abundance[row.names(absOrdered),]+.0001)
library(gplots)
heatmap.2(mostvariable[1:100,],trace="none",col=greenred(10))#!/bin/bash
#SBATCH -N 1 # Number of nodes
#SBATCH -n 10 # Number of cores
#SBATCH -t 240 # Runtime in minutes (0~10080)
#SBATCH -p general,serial_requeue,shared # Partition
#SBATCH --mem=50000 # Total memory (varies across nodes)
#SBATCH -o salmon_%j.out # Standard out goes to this file
#SBATCH -e salmon_%j.err # Standard err goes to this file
#SBATCH --mail-type=END # Email
#SBATCH --mail-user=ytang@hsph.harvard.edu
module load salmon
salmon index -t $TRANSCRIPTOME -i $INDEXBasic steps to access the cluster
Running jobs interactively: srun command
Rename path: mv command
Useful commands for data management
Inquire path on the Odyssey
Upload scripts to the Odyssey
View script list
View specific script
View specific script
Manage current jobs
Download the output
Part I. RNA-seq analyses
library(DESeq2)
library(EnsDb.Hsapiens.v86)
library(tximport)
library(rjson)
library(Seurat)
library(dplyr)For this HW, we will use the RNA-seq data of HepG2 with U2AF1 knock down and control, each with 2 replicates. https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE88002 https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE88226
The raw FASTQ files are available on Odyssey at: /n/stat115/HW3_2019/FastqData/
Solution:
1). Run Salmon on the full data seperately for four pair-wise samples.
Preparing work: Build Salmon Index
This part takes around 2 min 59 seconds for Odyssey to run.
#!/bin/bash
#SBATCH -N 1 # Number of nodes
#SBATCH -n 10 # Number of cores
#SBATCH -t 240 # Runtime in minutes (0~10080)
#SBATCH -p general,serial_requeue,shared # Partition
#SBATCH --mem=50000 # Total memory (varies across nodes)
#SBATCH -o salmon_%j.out # Standard out goes to this file
#SBATCH -e salmon_%j.err # Standard err goes to this file
#SBATCH --mail-type=END # Email
#SBATCH --mail-user=ytang@hsph.harvard.edu
module load salmon
salmon index -t /n/stat115/HW3_2019/transcriptome/Homo_sapiens.GRCh38.cdna.all.fa -i ../temp/SalmonIndexsbatch Salmonindex.sha). For Control group replicate 1: ENCFF178MWG and ENCFF229VBF
This part takes around 15 min 16 seconds for Odyssey to run.
#!/bin/bash
#SBATCH -N 1 # Number of nodes
#SBATCH -n 10 # Number of cores
#SBATCH -t 240 # Runtime in minutes (0~10080)
#SBATCH -p general,serial_requeue,shared # Partition
#SBATCH --mem=50000 # Total memory (varies across nodes)
#SBATCH -o star_%j.out # Standard out goes to this file
#SBATCH -e star_%j.err # Standard err goes to this file
#SBATCH --mail-type=END # Email
#SBATCH --mail-user=ytang@hsph.harvard.edu
module load salmon
salmon quant -i ../temp/SalmonIndex \
-l A \
-1 /n/stat115/HW3_2019/FastqData/ENCFF178MWG.fastq\
-2 /n/stat115/HW3/FastqData/ENCFF229VBF.fastq \
-o ../work/control1 \
--numBootstraps 100 \
-p 10 \
--gcBiassbatch SalmonAlignmentFull1.shb). For Control group replicate 2: FASTQ1 and FASTQ2
sbatch SalmonAlignmentFull2.shc). For Mutant group replicate 1: FASTQ1 and FASTQ2
sbatch SalmonAlignmentFull3.shd). For Mutant group replicate 2: FASTQ1 and FASTQ2
sbatch SalmonAlignmentFull4.sh# Build index for control and mutant groups
files <- grep("sf",list.files("deseq_data"),value=TRUE)
condition <- c("control","control","mutant","mutant")
names <- c("control1", "control2", "mutant1", "mutant2")
sampleTable <- data.frame(sampleName = files, fileName = files, condition = condition)
# Add gene references according to EnsDb.Hsapiens.v86
txdf <- transcripts(EnsDb.Hsapiens.v86, return.type="DataFrame")
tx2gene <- as.data.frame(txdf[,c("tx_id", "gene_id")])
# Perform DESeq analysis
txi <- tximport(file.path("deseq_data",files), type="salmon", ignoreTxVersion = TRUE, tx2gene = tx2gene)
dds <- DESeqDataSetFromTximport(txi,colData=sampleTable,design=~condition)
dds <- dds[rowSums(counts(dds)) > 1, ]
dds <- DESeq(dds)
res <- results(dds, alpha = 0.01)
res <- res[complete.cases(res),]
res <- res[order(res$padj),]
upR <- res[(res$padj < 0.01) & (res$log2FoldChange > 0),]
downR <- res[(res$padj < 0.01) & (res$log2FoldChange < 0),]
nrow(upR)
nrow(downR)
plotMA(res)# Now we write out the up-regulated and down-regulated gene symbles
# Then we performed GO analysis on DAVID with both of them
write.csv(upR@rownames, file = "Up.csv")
write.csv(downR@rownames, file = "Down.csv")Some useful gene and GO analysis tools:
UniProt: http://www.uniprot.org/uniprot/
GOA (Gene Ontology Annotation): https://www.ebi.ac.uk/GOA
AmiGO!: http://amigo1.geneontology.org/
# For transcript (no aggregate) level
library(tximport)
txi_transcript <- tximport(file.path("deseq_data",files), type="salmon",
txOut = TRUE, ignoreTxVersion = TRUE, tx2gene = tx2gene)
dds_transcript <- DESeqDataSetFromTximport(txi_transcript,colData=sampleTable,design=~condition)
dds_transcript <- dds_transcript[rowSums(counts(dds_transcript)) > 1, ]
dds_transcript <- DESeq(dds_transcript)
res_transcript <- results(dds_transcript, alpha = 0.01)
res_transcript <- res_transcript[complete.cases(res_transcript),]
res_transcript <- res_transcript[order(res_transcript$padj),]
upR_transcript <- res_transcript[(res_transcript$padj < 0.01) & (res_transcript$log2FoldChange > 0),]
downR_transcript <- res_transcript[(res_transcript$padj < 0.01) & (res_transcript$log2FoldChange < 0),]
nrow(upR_transcript)
nrow(downR_transcript)# Write out the files for DAVID analysis
write.csv(upR_transcript@rownames, file = "UpTranscript.csv")
write.csv(downR_transcript@rownames, file = "DownTranscript.csv")
# Perform GO analysis with DAVID# For gene level (aggregate)
library(tximport)
txi_gene <- tximport(file.path("deseq_data",files), type="salmon",
txOut = FALSE, ignoreTxVersion = TRUE, tx2gene = tx2gene)
dds_gene <- DESeqDataSetFromTximport(txi_gene,colData=sampleTable,design=~condition)
## Similar analysis as previous part# Write out the files for DAVID analysis
write.csv(upR_gene@rownames, file = "UpGene.csv")
write.csv(downR_gene@rownames, file = "DownGene.csv")
# Perform GO analysis with DAVIDtxi_gene_tpm <- txi_gene
txi_gene_tpm$counts = txi_gene_tpm$abundance
dds_gene_tpm <- DESeqDataSetFromTximport(txi_gene_tpm,colData=sampleTable,design=~condition)
dds_gene_tpm <- dds_gene_tpm[rowSums(counts(dds_gene_tpm)) > 1, ]
dds_gene_tpm <- DESeq(dds_gene_tpm)
# The following analysis are similar as the previous partPlease submit your solution directly on the canvas website. Please provide both your code in this Rmd document and an html file for your final write-up. Please pay attention to the clarity and cleanness of your homework.
The teaching fellows will grade your homework and give the grades with feedback through canvas within one week after the due date. Some of the questions might not have a unique or optimal solution. TFs will grade those according to your creativity and effort on exploration, especially in the graduate-level questions.